Patrick Little, patrick.little@ryerson.ca; Manjola Chiappetta, m1chiappetta@ryerson.ca
This section is a summary of the project.
instructions: Write an abstract (a kind of a summary) to describe your project. The abstract must be within 175 to 250 words (inclusive). To write the abstract, first state the problem you are addressing. For example, if your project is on Churn analysis, then give a brief explanation of it. Second, write the summary of your classification results (e.g., accuracy). Third, state key points about the post-predictive analysis and fourth, summarize your recommendations to the organization.
| Member Name | List of Tasks Performed |
|---|---|
| Patrick Little | - some tasks |
| Manjola Chiappetta | - some tasks |
In this section we will: - Look at the attribute types in the dataset - Find any missing values - Find the max, min, mean and standard deviation of the attributes - Determine any outlier values for the attributes under consideration - Analyze the distribution of numeric attributes
# Load the UCI Bank Marketing dataset (4,521 rows x 17 columns) straight
# from the project's GitHub repository. The target column is `y`
# (did the client subscribe to a term deposit: "yes"/"no").
bank<-read.csv("https://raw.githubusercontent.com/PatLittle/CIND119-group-project/main/bank_marketing/bank.csv")
# DataExplorer::introduce() — high-level profile: row/column counts,
# discrete vs continuous columns, and missing-value totals.
introduce(bank)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 4521 17 10 7 0
## total_missing_values complete_rows total_observations memory_usage
## 1 0 4521 76857 495152
# Visual version of the summary above.
plot_intro(bank)
# Per-column missingness plot (the table above already shows 0 missing).
plot_missing(bank)
# Bar charts of discrete attributes split by the target; left commented
# out, presumably to keep the report short — uncomment to regenerate.
#plot_bar(bank, by = "y")
# Distributions of the 7 numeric attributes (skew / outlier inspection).
plot_histogram(bank)
# Correlation heatmap of the discrete columns (type = "d").
plot_correlation(na.omit(bank), type = "d")
# PCA scree/loadings up to 90% of explained variance, one plot per page.
plot_prcomp(bank, variance_cap = 0.9, ncol =1L, nrow=1L)
# Structure dump: confirms 7 integer and 10 character columns.
str(bank)
## 'data.frame': 4521 obs. of 17 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : chr "unemployed" "services" "management" "management" ...
## $ marital : chr "married" "married" "single" "married" ...
## $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : chr "no" "yes" "yes" "yes" ...
## $ loan : chr "no" "yes" "no" "yes" ...
## $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : chr "oct" "may" "apr" "jun" ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
# Explicit per-column NA count — all zero, so no imputation is needed.
colSums(is.na(bank))
## age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## y
## 0
### Decision Tree ----

# Convert every character column to a factor so tidymodels treats them
# as categorical predictors. `mutate_if()` is superseded in dplyr; the
# modern equivalent is mutate(across(where(...), ...)).
bank_clean <- bank %>% mutate(across(where(is.character), factor))

# Reproducible 75/25 train/test split, stratified on the target `y`
# to keep the class ratio (heavily "no") consistent across splits.
set.seed(888)
bank_split <- initial_split(bank_clean, prop = 0.75,
                            strata = y)
bank_training <- bank_split %>% training()
bank_test <- bank_split %>% testing()

# 10-fold cross-validation resamples of the training set, used below
# for hyperparameter tuning.
bank_folds <- vfold_cv(bank_training, v = 10)

# Recipe with no preprocessing steps: rpart handles factor predictors
# natively, so the formula alone is enough.
bank_recipe <- recipe(y ~ ., data = bank_training)

# Materialize the (unchanged) training data after prepping the recipe,
# mainly as a sanity check of what the model will actually see.
bank_clean_baked <- bank_recipe %>%
  prep() %>%
  bake(new_data = bank_training)
# Decision-tree specification with all three rpart hyperparameters
# marked for tuning: pruning penalty, maximum depth, minimum node size.
tree_model <- decision_tree(cost_complexity = tune(),
tree_depth = tune(),
min_n = tune()) %>%
set_engine('rpart') %>%
set_mode('classification')
# Bundle the model and the (pass-through) recipe into one workflow.
tree_workflow <- workflow() %>%
add_model(tree_model) %>%
add_recipe(bank_recipe)
# Latin-hypercube sample of 60 candidate hyperparameter combinations,
# spread over the default ranges of each tuning parameter.
tree_grid <- grid_latin_hypercube(cost_complexity(),
tree_depth(),
min_n(),
size = 60)
# Re-seed so the tuning (fold assignment / any randomness) is
# reproducible, then evaluate all 60 candidates over the 10 CV folds.
set.seed(888)
tree_tuning <- tree_workflow %>%
tune_grid(resamples = bank_folds,
grid = tree_grid)
## Warning: package 'vctrs' was built under R version 4.0.5
# Show the 5 best hyperparameter combinations by cross-validated AUC.
# The metric is passed by name: positional use of `metric` in
# show_best() is deprecated in current versions of tune.
tree_tuning %>% show_best(metric = 'roc_auc')
## # A tibble: 5 x 9
## cost_complexity tree_depth min_n .metric .estimator mean n std_err
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl>
## 1 0.000000000688 10 20 roc_auc binary 0.863 10 0.00653
## 2 0.00000448 9 28 roc_auc binary 0.862 10 0.00615
## 3 0.00000322 9 27 roc_auc binary 0.862 10 0.00623
## 4 0.00000000641 11 24 roc_auc binary 0.860 10 0.00646
## 5 0.00000874 11 23 roc_auc binary 0.860 10 0.00644
## # ... with 1 more variable: .config <chr>
# Lock the single best combination into the workflow.
best_tree <- tree_tuning %>%
  select_best(metric = 'roc_auc')
final_tree_workflow <- tree_workflow %>%
  finalize_workflow(best_tree)
# Fit the finalized workflow on the full training set.
tree_wf_fit <- final_tree_workflow %>%
  fit(data = bank_training)
# Extract the underlying parsnip/rpart fit for plotting.
# extract_fit_parsnip() replaces the deprecated pull_workflow_fit().
tree_fit <- tree_wf_fit %>%
  extract_fit_parsnip()
# Variable-importance plot for the fitted tree.
vip(tree_fit)
# Plot the tree itself; roundint = FALSE suppresses rpart.plot's
# integer-rounding heuristic warnings.
rpart.plot(tree_fit$fit, roundint = FALSE)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# Refit on the training set and evaluate exactly once on the held-out
# test set defined by bank_split.
tree_last_fit <- final_tree_workflow %>%
  last_fit(bank_split)
# Test-set performance of the final tree: accuracy and ROC AUC.
tree_last_fit %>% collect_metrics()
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.892 Preprocessor1_Model1
## 2 roc_auc binary 0.827 Preprocessor1_Model1
# ROC curve on the held-out test set, using the predicted probability
# of the "no" class (the first factor level) as the estimate.
tree_last_fit %>% collect_predictions() %>%
  roc_curve(truth = y, estimate = .pred_no) %>%
  autoplot()
# Confusion matrix on the test set.
tree_predictions <- tree_last_fit %>% collect_predictions()
conf_mat(tree_predictions, truth = y, estimate = .pred_class)
## Truth
## Prediction no yes
## no 949 71
## yes 51 59
# Sanity check: score one held-out observation with the trained
# workflow stored inside the last_fit object.
predict(tree_last_fit$.workflow[[1]], bank_test[15, ])
## # A tibble: 1 x 1
## .pred_class
## <fct>
## 1 no
# Persist the trained workflow and demonstrate reloading it. The same
# path string is used for both calls (the original mixed "./saved_model.Rds"
# and "saved_model.Rds", which resolve to the same file but read as two).
saveRDS(tree_last_fit$.workflow[[1]], "saved_model.Rds")
trained_model <- readRDS("saved_model.Rds")
# --- Naive Bayes model ---
# Same seed and split parameters as the decision tree above, so both
# models are trained and tested on identical partitions.
set.seed(888)
nb_split <- initial_split(bank_clean, prop = 0.75,
strata = y)
nb_training <- nb_split %>% training()
nb_test <- nb_split %>% testing()
# 10-fold CV resamples of the NB training set.
nb_folds <- vfold_cv(nb_training, v = 10)
# Pass-through recipe, as for the tree model.
nb_recipe <- recipe(y ~ ., data = nb_training)
# NOTE(review): nb_wf is superseded by nb_wf_final below, which rebuilds
# the same workflow with the model attached — this one could be dropped.
nb_wf <- workflow() %>%
add_recipe(nb_recipe)
# discrim provides the naive_Bayes() parsnip model (masks dials::smoothness).
library(discrim)
##
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
##
## smoothness
# Naive Bayes specification with default hyperparameters, using the
# naivebayes package as the engine.
nb_spec <- naive_Bayes() %>%
set_mode("classification") %>%
set_engine("naivebayes")
nb_spec
## Naive Bayes Model Specification (classification)
##
## Computational engine: naivebayes
# NOTE(review): nb_fit is trained here but not referenced later in this
# chunk — evaluation below uses last_fit() instead; verify before removing.
nb_fit <- nb_wf %>%
add_model(nb_spec) %>%
fit(data = nb_training)
nb_wf_final <- workflow() %>%
add_recipe(nb_recipe) %>%
add_model(nb_spec)
# Cross-validated fit with per-fold predictions saved.
# NOTE(review): nb_rs metrics are never collected in the visible report;
# confirm it is used elsewhere before deleting.
nb_rs <- fit_resamples(
nb_wf_final,
nb_folds,
control = control_resamples(save_pred = TRUE)
)
# Fit on the NB training partition and evaluate once on its test set.
nb_last_fit <- nb_wf_final %>%
last_fit(nb_split)
# Test-set performance of naive Bayes: slightly lower accuracy but
# higher ROC AUC than the tuned decision tree (0.892 / 0.827).
nb_last_fit %>% collect_metrics()
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.881 Preprocessor1_Model1
## 2 roc_auc binary 0.849 Preprocessor1_Model1
# ROC curve on the held-out test set (probability of the "no" class).
nb_last_fit %>% collect_predictions() %>%
roc_curve(truth = y, estimate = .pred_no) %>%
autoplot()
# Confusion matrix on the test set.
nb_predictions <- nb_last_fit %>% collect_predictions()
conf_mat(nb_predictions, truth = y, estimate = .pred_class)
## Truth
## Prediction no yes
## no 956 91
## yes 44 39
Some text wrapping up the report